# Load previously saved workspace objects. The code below reads
# `gmi_senato_bills` and `gmi_camera_bills`, which are not created anywhere in
# this script, so they are presumably provided by this file -- TODO confirm.
load("~/Dropbox/Thesis_PhD/data/camera_sparql/acts_camera_senato_gmi.RData")

# Extract the text of a PDF file as a single character string.
#
# Args:
#   path_to_pdf: path to one existing PDF file.
#
# Returns:
#   A length-one character vector: the content of the document produced by
#   the tm PDF reader (engine "xpdf", language "it"), with its elements
#   joined by newlines.
#
# Raises:
#   An error if `path_to_pdf` is not a single non-missing string, if the file
#   does not exist, or if the 'tm' package is not installed.
parsePDF <- function(path_to_pdf) {

  # Validate the argument first so that a bad path produces a clear message
  # instead of an obscure failure inside the PDF reader.
  if (!is.character(path_to_pdf) || length(path_to_pdf) != 1L ||
      is.na(path_to_pdf)) {
    stop("`path_to_pdf` must be a single, non-missing file path.",
         call. = FALSE)
  }
  if (!file.exists(path_to_pdf)) {
    stop("PDF file does not exist: ", path_to_pdf, call. = FALSE)
  }

  # require() merely returns FALSE when a package is missing, which would
  # surface later as "could not find function"; fail explicitly instead.
  if (!requireNamespace("tm", quietly = TRUE)) {
    stop("Package 'tm' is required to parse PDF files.", call. = FALSE)
  }

  # readPDF() returns a reader function; calling it builds the tm document.
  read_pdf <- tm::readPDF(engine = "xpdf")
  tm_doc <- read_pdf(elem = list(uri = path_to_pdf),
                     language = "it",
                     id = "id1")

  # The first element of the document holds its content.
  paste(tm_doc[[1]], collapse = "\n")
}

# Senato

# Build one data frame row (or more, if several distinct dates match) per
# Senato PDF: the numeric resource id taken from the file path, the parsed
# text, and the date(s) looked up in `gmi_senato_bills`.

# Load once, and fail loudly if the package is missing.
library(stringr)

senato_files <- list.files("~/Downloads/senato_pdf", full.names = TRUE)

# Collect the per-file data frames in a preallocated list and bind them once
# at the end, rather than growing the result with rbind() on every iteration.
senato_rows <- vector("list", length(senato_files))

# seq_along() yields an empty sequence for an empty directory, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(senato_files)) {
  # First run of 2 to 10 digits found in the path.
  this_resource <- str_extract(senato_files[i], "\\d{2,10}")
  this_text <- parsePDF(senato_files[i])

  # `this_resource` is used as a regular expression against `ddl`.
  senato_rows[[i]] <- data.frame(
    act_resource = this_resource,
    act_text = this_text,
    act_date = unique(with(gmi_senato_bills,
                           date[grepl(this_resource, ddl)])),
    stringsAsFactors = FALSE
  )
}

if (length(senato_rows) > 0) {
  gmi_senato_bills_wt_text <- do.call(rbind, senato_rows)
} else {
  # Keep the expected columns so later column-wise conversions still work.
  warning("No files found in ~/Downloads/senato_pdf", call. = FALSE)
  gmi_senato_bills_wt_text <- data.frame(act_resource = character(0),
                                         act_text = character(0),
                                         act_date = character(0),
                                         stringsAsFactors = FALSE)
}

# Camera

# Acts
# Query the Camera SPARQL endpoint for resources that have a date, a type, a
# title and a description, keeping those whose description matches the
# case-insensitive regular expression in the FILTER clause. Title and
# description are then merged into a single `act_text` column.

# library() stops immediately when the package is missing; require() would
# only return FALSE and let the script fail later at the SPARQL() call.
library(SPARQL)
endpoint <- "http://dati.camera.it/sparql"
query <- "
select distinct ?act_resource ?act_title ?act_text ?act_date where {
?act_resource dc:date ?act_date.
?act_resource dc:type ?act_type.
?act_resource dc:title ?act_title.
?act_resource dc:description ?act_text
FILTER(REGEX(?act_text,'(reddito minimo garantito|reddito (minimo )?di cittadinanza)','i'))
} LIMIT 10000
"

qd <- SPARQL(endpoint, query)
gmi_camera_acts_wt_text <- qd$results

# Prepend the title to the description (space separated), then drop the
# now redundant title column.
gmi_camera_acts_wt_text$act_text <- with(gmi_camera_acts_wt_text,
                                         paste(act_title, act_text))

gmi_camera_acts_wt_text$act_title <- NULL

# Bills

# For each selected bill, fetch its RDF/XML description, read the PDF link
# from Description -> relation -> resource, and download that PDF into the
# local Camera PDF directory, named after the resource id.

# Load dependencies once, outside the loop; stringr is needed for
# str_extract() below.
library(stringr)
library(XML)
library(RCurl)

# Same directory that is listed afterwards to parse the downloaded files.
# Using the home-relative form avoids a user-specific absolute path.
camera_pdf_dir <- "~/Downloads/camera_pdf"
dir.create(camera_pdf_dir, showWarnings = FALSE, recursive = TRUE)

uris <- unique(gmi_camera_bills$act_resource)
uris <- gsub("<|>", "", uris)
uris <- paste0(uris, "?output=application%2Frdf%2Bxml")

# NOTE(review): only entries 12 to 14 are processed, as in the original code.
# This looks like a leftover from a partial or resumed run -- confirm whether
# all URIs should be downloaded. The intersect() keeps the subset valid when
# fewer than 14 URIs are available (plain indexing would yield NA entries).
uri_subset <- uris[intersect(12:14, seq_along(uris))]

for (uri in uri_subset) {
  xml_doc <- getURL(uri)
  # Resource id: the text between ".rdf/" and the query string.
  act_resource <- str_extract(uri,"(?<=.rdf/)(.*)(?=\\?)")
  rdf_doc <- xmlParse(xml_doc)
  xml_data <- xmlToList(rdf_doc)
  pdf_url <- xml_data[['Description']][['relation']][['resource']]
  # Skip resources whose description carries no PDF link.
  if (is.null(pdf_url)) {
    next
  }
  # mode = "wb" keeps the binary PDF intact on every platform.
  download.file(pdf_url,
                file.path(camera_pdf_dir, paste0(act_resource, ".pdf")),
                mode = "wb")
}

# Build one data frame row (or more, if several distinct titles/dates match)
# per downloaded Camera PDF: the resource id taken from the file name, the
# parsed text, and the title and date looked up in `gmi_camera_bills`.

# Load once, and fail loudly if the package is missing.
library(stringr)

camera_files <- list.files("~/Downloads/camera_pdf", full.names = TRUE)

# Collect the per-file data frames in a preallocated list and bind them once
# at the end, rather than growing the result with rbind() on every iteration.
camera_rows <- vector("list", length(camera_files))

# seq_along() yields an empty sequence for an empty directory, whereas
# 1:length(x) would iterate over c(1, 0).
for (i in seq_along(camera_files)) {
  # Text between "pdf/" and the trailing ".pdf" of the path.
  this_resource <- str_extract(camera_files[i], "(?<=pdf/)(.*)(?=.pdf)")
  this_text <- parsePDF(camera_files[i])

  # `this_resource` is used as a regular expression against `act_resource`.
  matches_resource <- grepl(this_resource, gmi_camera_bills$act_resource)

  # stringsAsFactors = FALSE matches the Senato data frame and keeps the
  # columns as character on R versions older than 4.0 as well.
  camera_rows[[i]] <- data.frame(
    act_resource = this_resource,
    act_text = this_text,
    act_title = unique(gmi_camera_bills$act_title[matches_resource]),
    act_date = unique(gmi_camera_bills$act_date[matches_resource]),
    stringsAsFactors = FALSE
  )
}

if (length(camera_rows) > 0) {
  gmi_camera_bills_wt_text <- do.call(rbind, camera_rows)
} else {
  # Keep the expected columns so later column-wise conversions still work.
  warning("No files found in ~/Downloads/camera_pdf", call. = FALSE)
  gmi_camera_bills_wt_text <- data.frame(act_resource = character(0),
                                         act_text = character(0),
                                         act_title = character(0),
                                         act_date = character(0),
                                         stringsAsFactors = FALSE)
}

# Normalise column types of the three result data frames, then save them.

# Camera bills: dates are parsed with the "%Y%m%d" format.
gmi_camera_bills_wt_text[["act_date"]] <- as.Date(
  gmi_camera_bills_wt_text[["act_date"]],
  format = "%Y%m%d"
)
gmi_camera_bills_wt_text[["act_text"]] <- as.character(
  gmi_camera_bills_wt_text[["act_text"]]
)

# Camera acts: only the first eight characters of the date are parsed.
camera_acts_date_prefix <- substr(gmi_camera_acts_wt_text[["act_date"]], 1, 8)
gmi_camera_acts_wt_text[["act_date"]] <- as.Date(camera_acts_date_prefix,
                                                 format = "%Y%m%d")
gmi_camera_acts_wt_text[["act_text"]] <- as.character(
  gmi_camera_acts_wt_text[["act_text"]]
)

# Senato bills: dates are converted with as.Date() defaults.
gmi_senato_bills_wt_text[["act_date"]] <- as.Date(
  gmi_senato_bills_wt_text[["act_date"]]
)
gmi_senato_bills_wt_text[["act_text"]] <- as.character(
  gmi_senato_bills_wt_text[["act_text"]]
)

# Write all three data frames into one RData file in the working directory.
save(
  list = c("gmi_camera_bills_wt_text",
           "gmi_senato_bills_wt_text",
           "gmi_camera_acts_wt_text"),
  file = "m5s_parliament_gmi_act_texts.RData"
)

